1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.filter;
28 import java.nio.ByteBuffer;
29 import java.nio.CharBuffer;
30 import java.nio.charset.Charset;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 import org.apache.log4j.Logger;
34 import org.smartcrawler.common.AbstractParametrizableComponent;
35 import org.smartcrawler.common.Context;
36 import org.smartcrawler.common.SCLogger;
37 import org.smartcrawler.retriever.Content;
38
39 /***
40 *
41 *
42 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
43 * @version <tt>$Revision: 1.3 $</tt>
44 */
45 public class ContainedWordFilter extends AbstractParametrizableComponent implements PostFilterLink {
46
47 private static Logger log = SCLogger.getLogger(ContainedWordFilter.class);
48
49
50 /***
51 *
52 * @param link
53 * @return
54 */
55 public boolean isPermitted(Context conf, Content content) {
56 String keyword = getParameter("keyword");
57 log.debug("isPermitted() BEGIN [keyword=" + keyword + "]");
58
59 boolean res = false;
60
61 boolean isHtml = content.getContentType().indexOf("htm") >= 0;
62
63 if (isHtml) {
64 try {
65 Pattern p =
66 Pattern.compile("(<[^<]*>([^<]*)<[^<]*>)",
67 Pattern.MULTILINE |
68 Pattern.CASE_INSENSITIVE |
69 Pattern.DOTALL);
70 byte[] buffer = content.getBuffer();
71 ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
72 bbuf.put(buffer);
73 bbuf.flip();
74 CharBuffer charBuf =
75 Charset.forName("8859_1").newDecoder().decode(bbuf);
76 Matcher matcher = p.matcher(charBuf);
77 while (matcher.find()) {
78 CharSequence cs = matcher.group(2);
79
80 if (cs.toString().trim().toLowerCase().contains(keyword)) {
81 res = true;
82 break;
83 }
84 }
85 }catch (Exception e) {
86 log.warn("Unable to apply filter on " + content.getLink());
87 }
88 }
89 log.debug("Checking content res=" + res);
90 log.debug("isPermitted() END");
91 return res;
92 }
93 }